home *** CD-ROM | disk | FTP | other *** search
- #include <stdio.h>
- #include <ctype.h>
- #include "util.h"
- #include "combine.h"
- /*
- * pass4: Handle non-unique record clusters.
- *
- * This routine tries to fix up problems which arise due to groups of
- * non-unique records which are surrounded by insertions. When this
- * condition arises, the group of non-unique records are treated as
- * a part of the inserted text.
- *
- * This routine finds a pair of anchor records which have inserted lines
- * between them. (See diagram below.) All inserted lines are compared
- * to one another trying to find a group of non-unique lines which
- * match. Such lines are considered equal and are flagged as anchor
- * points.
- *
- * match1 match1
- * insert
- * non-unique1 non-unique1
- * insert
- * match2 match2
- *
- * This routine runs in M x N time where 'M' is the number of lines
- * inserted at the current position in the first file and 'N' is the number
- * of lines inserted at the current position in the second file.
- *
- * Possible design flaw: The algorithm used to match the non-unique lines
- * should really be the same alogirthm used for the rest of the file.
- * That algorithm runs in linear time. The main problem with doing that is
- * that the other passes of this program were not designed for or optimized
- * around working for a portion of a file. Also, the proposed algorithm
- * relies on the fact that some of the non-unique lines will become unique
- * when only this fragment of the files are compared.
- *
- * Return value:
- * This procedure has no return value.
- */
-
- void pass4 () {
-
- int i; /* Misc. variable */
-
- int j; /* Misc. variable */
-
- /*
- * Scan each pair of files.
- */
-
- for (j = 0; j < UNIQUE_MATCH_COUNT; ++j) {
-
- i = unique_match[j];
-
- if (files[curr_file[i]].record != 0 &&
- files[corres_file[i]].record != 0) {
-
- pass4_scan (i);
-
- }
-
- }
-
- }
- /*
- * pass4_scan: pass4 for two files.
- *
- * This routine implements the pass4 algorithm for a pair of files.
- *
- * Return value:
- * This procedure has no return value.
- */
-
- void pass4_scan (match_no)
- int match_no; /* input */
- /* Which relationship is being scanned */
-
- {
-
- register int i; /* Index into record array of file1 of the
- current position in the inserted records */
-
- register int j; /* Index into record array of file2 of the
- current position in the inserted records */
-
- file_type * file1_ptr; /* First file - current_file */
-
- file_type * file2_ptr; /* Second file - corresponding file */
-
- int file1_sub; /* For each record of the first file, this is a
- subscript of the 'value' array of the
- relationship between file1 and file2 */
-
- int file2_sub; /* For each record of the second file, this is
- a subscript of the 'value' array of the
- relationship between file2 and file1 */
-
- register int fn_index1; /* Index into record array of file1 of the
- position of the last inserted record */
-
- register int fn_index2; /* Index into record array of file2 of the
- position of the last inserted record */
-
- int index1; /* Index into record array of file1 of the
- record currently being processed */
-
- record_type * record1; /* Pointer to record array of file1 */
-
- record_type * record2; /* Pointer to record array of file2 */
-
- register int st_index1; /* Index into record array of file1 of the
- position of the first inserted record */
-
- register int st_index2; /* Index into record array of file2 of the
- position of the first inserted record */
-
-
- /*
- * Initialize indexes to first and last record of the files.
- */
-
- file1_ptr = &files[curr_file[match_no]];
- file2_ptr = &files[corres_file[match_no]];
- file1_sub = value_sub[match_no];
- file2_sub = rev_value_sub[match_no];
-
- record1 = file1_ptr -> record;
- record2 = file2_ptr -> record;
-
- /*
- * For each record in the first file.
- *
- * Note: given the existence of the 'begin' and 'end' record on each
- * end of the record array,
- * we can convince ourselves that the tests below won't ever index
- * off the end of the arrays.
- */
- for (index1 = BEGIN_INDEX; index1 < file1_ptr->record_array_size-1;
- index1++) {
-
- /*
- * If the current record in file1 is not an anchor point,
- * continue the big loop.
- */
- st_index2 = record1[index1].value[file1_sub];
- if (is_hash_code (st_index2)) {
- continue;
- }
-
- /*
- * Compute the index into file2 of the corresponding record.
- * Compute a pointer to value field of the next record in each
- * file.
- *
- * If either of the next records are anchor points,
- * continue the big loop.
- */
- if (!is_hash_code( record1[index1 + 1].value[file1_sub]))
- continue;
- if (!is_hash_code( record2[st_index2 + 1].value[file2_sub]))
- continue;
-
- /*
- * We have found the point where a group of inserted records
- * begins in both files.
- *
- * Set up indexes to the first and last inserted records in
- * the group.
- */
- st_index1 = fn_index1 = index1 + 1;
- st_index2 = st_index2 + 1;
- while (is_hash_code (record1[fn_index1].value[file1_sub])) {
- ++fn_index1;
- }
- fn_index2 = record1[fn_index1].value[file1_sub];
- fn_index1--; /* Adjust to index to last inserted record */
- fn_index2--; /* Adjust to index to last inserted record */
-
- /*
- * If this group of inserted records is adjacent to a group of
- * moved records, skip the comparison. This isn't the case
- * we're interested in.
- */
- index1 = fn_index1;
- if (st_index2 > fn_index2) {
- continue;
- }
-
- /*
- * Ensure that only non-unique records exist in the range
- * of the corresponding file. This catches a move in the
- * other direction.
- */
- for ( j=st_index2; j<=fn_index2; ++ j){
- if (!is_hash_code (record2[j].value[file2_sub]))
- break;
- }
- if ( j <= fn_index2 )
- continue;
-
- /*
- * Compare the hash codes. If two records are identical,
- * consider them to be anchor points.
- */
- if ( p4_debug ) {
- printf("pass4: match_no: %d (%d-%d) (%d-%d)\n",
- match_no, st_index1, fn_index1,
- st_index2, fn_index2);
- }
-
- for (i = st_index1; i<=fn_index1; ++i) {
- for (j=st_index2; j<=fn_index2; ++j) {
-
- if (record1[i].value[file1_sub] ==
- record2[j].value[file2_sub]) {
-
- link_records(match_no,i, j);
- st_index2 = j + 1;
- break;
-
- }
-
- }
- }
- }
-
- }
-